## rm(list = ls())

## Print the start time; a second timestamp is printed at the end of the script.
print(Sys.time())

## Keep strings as character vectors when building data frames. FALSE is
## spelled out because `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(stringi))
suppressPackageStartupMessages(library(stm))


## install_github("wilryh/parrot", dependencies=TRUE)
suppressPackageStartupMessages(library(parrot))


## Read every column as character to keep long IDs accurate. `.default`
## applies the character type to all columns, so the file no longer has to be
## read a first time just to count its columns.
russia <- read_delim(
    "data/ira_tweets_csv_hashed.csv",
    delim = ",", comment = "", quote = "\"",
    col_types = cols(.default = col_character())
)

## Keep tweets up to and including 2016-11-08. tweet_time is character, so the
## first 10 characters are compared as strings; this orders chronologically
## provided they hold a zero-padded YYYY-MM-DD date (assumed -- confirm against
## the data). subset() drops rows where the comparison is NA.
russia_sampled_election16 <- subset(
    russia,
    substr(tweet_time, 1, 10) <= "2016-11-08"
)

## summary stats for SI
## All four statistics are written in one call; the resulting log content is
## the same as overwriting the file with the first line and appending the rest.
## NOTE(review): the labels are kept exactly as they were ("hash.csv" although
## the file is "...hashed.csv", and "u unique users" in the last row) in case
## anything downstream matches on them -- correct them if nothing does.
log_file <- "logs/process_russian_account_tweets_replicate_psrm.log"
summary_stats <- data.frame(
    variable = c(
        "nrows ira_tweets_csv_hash.csv",
        "n unique users ira_tweets_csv_hash.csv",
        "nrows russia_sampled_election16",
        "u unique users russia_sampled_election16"
    ),
    value = c(
        nrow(russia),
        length(unique(russia$userid)),
        nrow(russia_sampled_election16),
        length(unique(russia_sampled_election16$userid))
    ),
    stringsAsFactors = FALSE
)
write.table(
    summary_stats,
    file = log_file,
    append = FALSE,
    sep = ":", col.names = FALSE, row.names = FALSE
)

## The full table is no longer needed once the subset and the log exist.
rm(russia, summary_stats, log_file)

## Run the text-processing pipeline for each text source and save the
## intermediate objects under data/. Only "tweets" is run at present; any other
## value (e.g. "account_descriptions") takes the text from
## user_profile_description instead of tweet_text.
for (tweets_or_descriptions in c("tweets")) { #account_descriptions

    ## The two sources differ only in which column supplies the text.
    text_column <- if (tweets_or_descriptions == "tweets") {
        "tweet_text"
    } else {
        "user_profile_description"
    }

    ## stri_enc_toascii() substitutes "\032" (ASCII SUB) for every non-ASCII
    ## code point; gsub() then deletes those placeholders.
    ascii_text <- gsub(
        "\032", "",
        stringi::stri_enc_toascii(russia_sampled_election16[[text_column]])
    )

    ## Lowercase and remove stopwords, without stemming; the full data set is
    ## passed along as document metadata.
    processed_election16 <- textProcessor(
        ascii_text,
        metadata = data.frame(russia_sampled_election16),
        removestopwords = TRUE, lowercase = TRUE, stem = FALSE
    )
    save(
        processed_election16,
        file = paste0(
            "data/russia_", tweets_or_descriptions,
            "_processed_election16_replicate_psrm.RData"
        )
    )

    ## lower.thresh = 0: no vocabulary is dropped for being too rare.
    out_election16 <- prepDocuments(
        processed_election16$documents,
        processed_election16$vocab,
        meta = processed_election16$meta,
        lower.thresh = 0
    )
    ## doc_to_tdm() comes from parrot; presumably it turns the prepDocuments()
    ## output into a term-document matrix -- see the parrot documentation.
    tdm_election16 <- doc_to_tdm(out_election16)
    meta_election16 <- out_election16$meta
    save(
        meta_election16,
        tdm_election16,
        file = paste0(
            "data/russia_", tweets_or_descriptions,
            "_meta_tdm_election16_replicate_psrm_low_thresh0.RData"
        )
    )
    ##  (used in glove check only)
    save(
        out_election16,
        file = paste0(
            "data/russia_", tweets_or_descriptions,
            "_out_election16_replicate_psrm_low_thresh0.RData"
        )
    )

    ## Drop the large intermediates (and loop-local helpers) before moving on.
    rm(processed_election16, out_election16, ascii_text, text_column)
}

## Print the finish time; together with the timestamp printed at the start of
## the script this brackets the run.
print(Sys.time())
